import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
import pytz
import itertools
import visualize
import utils
import pydotplus
import xgboost as xgb
from sklearn import metrics
import pvlib
import cs_detection
import visualize_plotly as visualize
from IPython.display import Image
%load_ext autoreload
%autoreload 2
np.set_printoptions(precision=4)
%matplotlib notebook
Only making ground predictions using PVLib clearsky model and statistical model. NSRDB model won't be available to ground measurements.
nsrdb = cs_detection.ClearskyDetection.read_pickle('srrl_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
ground = cs_detection.ClearskyDetection.read_pickle('srrl_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
feature_cols = [
'tfn',
'abs_ideal_ratio_diff',
'abs_ideal_ratio_diff mean',
'abs_ideal_ratio_diff std',
'abs_ideal_ratio_diff max',
'abs_ideal_ratio_diff min',
'GHI Clearsky GHI pvlib gradient ratio',
'GHI Clearsky GHI pvlib gradient ratio mean',
'GHI Clearsky GHI pvlib gradient ratio std',
'GHI Clearsky GHI pvlib gradient ratio min',
'GHI Clearsky GHI pvlib gradient ratio max',
'GHI Clearsky GHI pvlib gradient second ratio',
'GHI Clearsky GHI pvlib gradient second ratio mean',
'GHI Clearsky GHI pvlib gradient second ratio std',
'GHI Clearsky GHI pvlib gradient second ratio min',
'GHI Clearsky GHI pvlib gradient second ratio max',
'GHI Clearsky GHI pvlib line length ratio',
'GHI Clearsky GHI pvlib line length ratio gradient',
'GHI Clearsky GHI pvlib line length ratio gradient second'
]
target_cols = ['sky_status']
ground.df.index[0], ground.df.index[-1]
nsrdb.df.index[0], nsrdb.df.index[-1]
ground2 = cs_detection.ClearskyDetection(ground.df)
ground2.trim_dates('01-01-2008', '01-01-2012')
ground2.df = ground2.df[ground2.df.index.minute % 30 == 0]
nsrdb2 = cs_detection.ClearskyDetection(nsrdb.df)
nsrdb2.trim_dates('01-01-2008', '01-01-2012')
nsrdb2.df = nsrdb2.df[nsrdb2.df.index.minute % 30 == 0]
vis = visualize.Visualizer()
vis.add_line_ser(ground2.df['GHI'], 'Grnd GHI')
vis.add_line_ser(ground2.df['Clearsky GHI pvlib'], 'Grnd GHIcs')
vis.add_line_ser(nsrdb2.df['GHI'], 'NSRDB GHI')
vis.add_line_ser(nsrdb2.df['Clearsky GHI pvlib'], 'NSRDB GHIcs')
vis.show()
print(np.sqrt(np.mean((nsrdb2.df['GHI'] - ground2.df['GHI'])**2)))
print(np.sqrt(np.mean((nsrdb2.df['GHI'] - nsrdb2.df['Clearsky GHI pvlib'])**2)))
print(np.sqrt(np.mean((ground2.df['GHI'] - ground2.df['Clearsky GHI pvlib'])**2)))
ground2 = cs_detection.ClearskyDetection(ground.df)
ground2.trim_dates('01-01-2008', '01-01-2012')
nsrdb2 = cs_detection.ClearskyDetection(nsrdb.df)
nsrdb2.trim_dates('01-01-2008', '01-01-2012')
ground2.df['sky_status'] = nsrdb2.df['sky_status']
ground2.df['sky_status'].head()
ground2.df['sky_status'] = ground2.df['sky_status'].fillna(False)
ground2.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
ground3 = cs_detection.ClearskyDetection(ground2.df)
ground3.trim_dates('07-01-2011', '07-15-2011')
vis = visualize.Visualizer()
vis.add_line_ser(ground3.df['GHI'], 'Grnd GHI')
vis.add_line_ser(ground3.df['Clearsky GHI pvlib'], 'Grnd GHIcs')
vis.add_circle_ser(ground3.df[ground3.df['sky_status']]['GHI'], 'Clear')
vis.show()
utils.calc_all_window_metrics(ground2.df, 11, 'GHI', 'Clearsky GHI pvlib', overwrite=True)
ground2_train = ground2.df[ground2.df.index.minute % 30 == 0]
ground2_train = ground2_train[ground2_train.index < '07-01-2011']
from sklearn import tree
from sklearn import ensemble
import xgboost as xgb
# clf = ensemble.RandomForestClassifier(n_estimators=100)
# clf = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=.005, reg_lambda=.01)
clf = ensemble.RandomForestClassifier(max_depth=6, n_estimators=128, n_jobs=-1)
from sklearn import model_selection
scores = model_selection.cross_val_score(clf, ground2_train[feature_cols].values, ground2_train[target_cols].values.flatten())
np.mean(scores), np.std(scores)
clf.fit(ground2_train[feature_cols].values, ground2_train[target_cols].values.flatten())
ground2_test = cs_detection.ClearskyDetection(ground2.df)
ground2_test.trim_dates('07-01-2011', '07-15-2011')
pred = ground2_test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 11)
pred2 = pred[pred.index.minute % 30 == 0]
sky_stat = ground2_test.df[ground2_test.df.index.minute % 30 == 0]['sky_status']
cm = metrics.confusion_matrix(sky_stat, pred2)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
ghi = ground2_test.df['GHI']
ghi_cs = ground2_test.df['Clearsky GHI pvlib']
is_clear = ground2_test.df['sky_status iter']
sky = ground2_test.df['sky_status']
ghi = ghi[ghi.index < '07-15-2011']
ghi_cs = ghi_cs[ghi_cs.index < '07-15-2011']
sky = sky[sky.index < '07-15-2011']
vis = visualize.Visualizer()
vis.add_line_ser(ghi, 'ghi')
vis.add_line_ser(ghi_cs, 'ghi_cs')
vis.add_circle_ser(ghi[is_clear], 'clear')
vis.add_circle_ser(ghi[sky], 'nsrdb')
# vis.add_circle_ser(ghi[ghi.index.isin(sky.index)], 'clear nsrdb')
vis.show()